ASER Pakistan 2016

In this piece of paper, a set of data obtained from Annual Status of Education Report (ASER) is explored. The raw data was downloaded from the link here. https://palnetwork.org/aser-centre/

Preparation

Packages Used

library(tidyverse)
library(ggplot2)
library(ggthemes)
library(ggrepel)
library(gghighlight)
library(stringr)
library(dplyr)
library(sf)
library(scatterplot3d)
library(car)
library(ResourceSelection) #to excute Hosmer-Lemeshow test
## Warning: package 'ResourceSelection' was built under R version 4.0.3
# library(broom)
require(ggiraph)
## Warning: package 'ggiraph' was built under R version 4.0.3
require(ggiraphExtra)
# require(plyr)

Data Installation

ASER2016

provdist <- read.csv("aser/ASER2016ProvDist.csv")
school <- read.csv("aser/ASER2016GSchool.csv")
child <- read.csv("aser/ASER2016Child.csv")
pschool <- read.csv("aser/ASER2016PvtSchool.csv")
gschool <- read.csv("aser/ASER2016GSchool.csv")
parent <- read.csv("aser/ASER2016Parent.csv")
house <- read.csv("aser/ASER2016HouseholdIndicators.csv")
RegionName <- c("2" = "Panjab", 
                "3" = "Sindh", 
                "4" = "Balochistan", 
                "5" = "Khyber Pakhtunkhwa", 
                "6" = "Gilgit-Baltistan", 
                "7" = "Azad Jammu and Kashmir", 
                "8" = "Islamabad - ICT", 
                "9" = "Federally Administrated Tribal Areas")
Gender <- c("0" = "Male",
            "-1" = "Female")

Spatial Data

ica <- sf::st_read("map/pak_ica_categories_areas_geonode_apr2017.shp")
## Reading layer `pak_ica_categories_areas_geonode_apr2017' from data source `C:\Program Files\R\capstone\map\pak_ica_categories_areas_geonode_apr2017.shp' using driver `ESRI Shapefile'
## Simple feature collection with 156 features and 8 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: 60.8786 ymin: 23.69468 xmax: 77.83397 ymax: 37.08942
## geographic CRS: WGS 84

Data Wrangling

ica_df <- ica %>% 
  mutate(centroid = st_centroid(geometry),
    x = st_coordinates(centroid)[,1],
    y = st_coordinates(centroid)[,2]) %>% 
  as.data.frame()
## Warning: Problem with `mutate()` input `centroid`.
## x st_centroid does not give correct centroids for longitude/latitude data
## i Input `centroid` is `st_centroid(geometry)`.
## Warning in st_centroid.sfc(geometry): st_centroid does not give correct
## centroids for longitude/latitude data
ica_df <- ica_df %>% select(Province, Districts, x, y)
ica_df <- ica_df %>% summarize(Province = tolower(Province), Districts = tolower(Districts), x = x, y = y)

Child and ProvDist data combination

child_dname <- child %>% left_join(provdist[-1])
## Joining, by = "DID"
child_dname <- child_dname %>% mutate(dname = tolower(DNAME))
ica_df_3 <- ica_df %>% filter(Province == "sindh")

ica_df_3$Districts <- ica_df_3$Districts %>%  
  str_replace("ghotki", "gotki") %>%
  str_replace("mirpur khas", "mirpurkhas") %>% 
  str_replace("malir karachi", "karachi-malir-rural") %>% 
  str_replace("naushahro feroze", "nowshero feroze") %>% 
  str_replace("kambar shahdad kot", "qambar shahdadkot") %>% 
  str_replace("sujawal", "sajawal") %>% 
  str_replace("shaheed benazir abad", "shaheed benazirabad") %>% 
  str_replace("tando allahyar", "tando allah yar") %>% 
  as.vector()

child_dname_3 <- child_dname %>% filter(RNAME == "Sindh") %>% left_join(ica_df_3, by = c("dname" = "Districts"))

child_dname_3 %>% group_by(dname) %>% summarize(n = sum(x))
## `summarise()` ungrouping output (override with `.groups` argument)
ica_df_3

Preparation for Logistic Regression Analysis

Making Dataframe With Dummy Variables

child_ica_dummy <- child_ica %>% filter(!is.na(C002), !is.na(C003), !is.na(PR004), !is.na(PR009))
child_ica_dummy$C002_01 <- ifelse(child_ica_dummy$C002 == -1, 1, 0)
child_ica_dummy$C003_01 <- ifelse(child_ica_dummy$C003 == 3, 1, 0)     # currently-enrolled
child_ica_dummy$C003_1_01 <- ifelse(child_ica_dummy$C003 == 1, 1, 0)   # never-enrolled
child_ica_dummy$C003_2_01 <- ifelse(child_ica_dummy$C003 == 2, 1, 0)   # drop-out
child_ica_dummy$PR004_01 <- ifelse(child_ica_dummy$PR004 == -1, 1, 0)
child_ica_dummy$PR009_01 <- ifelse(child_ica_dummy$PR009 == -1, 1, 0)
child_ica_dummy$PR004_PR009_01 <- ifelse(
  child_ica_dummy$PR004 == -1 | child_ica_dummy$PR009 == -1, 1, 0)


child_ica_dummy$PR004_only_01 <- ifelse(
  child_ica_dummy$PR004 == -1 & child_ica_dummy$PR009 == 0, 1, 0)
child_ica_dummy$PR009_only_01 <- ifelse(
  child_ica_dummy$PR004 == 0 & child_ica_dummy$PR009 == -1, 1, 0)
child_ica_dummy$PR004_PR009_both_01 <- ifelse(
  child_ica_dummy$PR004 == -1 & child_ica_dummy$PR009 == -1, 1, 0)
n_children_in_household <- child_ica_dummy %>% 
  group_by(HHID) %>% 
  summarize(n_children_in_household = length(unique(CID)))
## `summarise()` ungrouping output (override with `.groups` argument)
child_ica_dummy <- child_ica_dummy %>% left_join(n_children_in_household)
## Joining, by = "HHID"
child_ica_dummy$H002_1_01 <- ifelse(child_ica_dummy$H002 == 1, 1, 0)
child_ica_dummy$H002_2_01 <- ifelse(child_ica_dummy$H002 == 2, 1, 0)
child_ica_dummy$H002_3_01 <- ifelse(child_ica_dummy$H002 == 3, 1, 0)

Regional Dummy Variables

child_ica_dummy <- child_ica_dummy
child_ica_dummy$Panjab <- ifelse(child_ica_dummy$RID == 2, 1, 0)
child_ica_dummy$Sindh <- ifelse(child_ica_dummy$RID == 3, 1, 0)
child_ica_dummy$Balochistan <- ifelse(child_ica_dummy$RID == 4, 1, 0)
child_ica_dummy$Khyber_Pakhtunkhwa <- ifelse(child_ica_dummy$RID == 5, 1, 0)
child_ica_dummy$Gilgit_Baltistan <- ifelse(child_ica_dummy$RID == 6, 1, 0)
child_ica_dummy$Azad_Jammu_and_Kashmir <- ifelse(child_ica_dummy$RID == 7, 1, 0)
child_ica_dummy$Islamabad_ICT <- ifelse(child_ica_dummy$RID == 8, 1, 0)
child_ica_dummy$Federally_Administrated_Tribal_Areas <- ifelse(child_ica_dummy$RID == 9, 1, 0)
dists_near_hunza_1 <- c(178, 199, 243, 259, 265, 266, 267, 270, 273, 274)
child_ica_dummy$DID_near_hunza_1 <- ifelse(child_ica_dummy$DID %in% dists_near_hunza_1, 1, 0)
dists_near_hunza_2 <- c(176, 178, 185, 199, 243, 245, 259, 265, 267, 270, 273, 274)
child_ica_dummy$DID_near_hunza_2 <- ifelse(child_ica_dummy$DID %in% dists_near_hunza_2, 1, 0)

Factor Variables

child_ica_dummy$DID <- child_ica_dummy$DID %>% as.factor()

Notes on Data Wrangling

Eliminated NAs

child_ica %>% 
  summarize(C002_na = sum(is.na(C002)),
            C003_na = sum(is.na(C003)),
            PR004_na = sum(is.na(PR004)),
            PR009_na = sum(is.na(PR009)))

Eliminated Rows in Total

data.frame(original_rows = nrow(child_ica),
           eliminated_rows = nrow(child_ica) - nrow(child_ica_dummy),
           ratio = (nrow(child_ica)-nrow(child_ica_dummy))/nrow(child_ica))

Eliminated NAs Hunza

child_ica %>% 
  filter(DID == 266) %>% 
  summarize(C002_na = sum(is.na(C002)),
            C003_na = sum(is.na(C003)),
            PR004_na = sum(is.na(PR004)),
            PR009_na = sum(is.na(PR009)))

Eliminated Rows in Total Hunza

data.frame(original_rows = nrow(child_ica %>% filter(DID == 266)),
           eliminated_rows = nrow(child_ica %>% filter(DID == 266)) - nrow(child_ica_dummy %>% filter(DID == 266)),
           ratio = (nrow(child_ica %>% filter(DID == 266) %>% filter(DID == 266))-nrow(child_ica_dummy %>% filter(DID == 266)))/nrow(child_ica %>% filter(DID == 266)))

Generalized Linear Models

Hunza. GLM Age >= 5 C003_01 ~ n_children_in_household + PR004_PR009_both_01

glm_child <- glm(C003_01 ~ n_children_in_household + PR004_PR009_both_01, family = "binomial", data = child_ica_dummy %>% filter(C001 >= 5, DID == 266))

ggPredict(glm_child, se = TRUE, colorAsFactor = TRUE, show.summary = TRUE, point = TRUE)
## 
## Call:
## glm(formula = C003_01 ~ n_children_in_household + PR004_PR009_both_01, 
##     family = "binomial", data = child_ica_dummy %>% filter(C001 >= 
##         5, DID == 266))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2554   0.1395   0.1646   0.2860   0.5418  
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               4.1748     0.5255   7.944 1.96e-15 ***
## n_children_in_household  -0.3329     0.1099  -3.028 0.002461 ** 
## PR004_PR009_both_01       1.4519     0.3926   3.698 0.000217 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 384.83  on 1409  degrees of freedom
## Residual deviance: 349.37  on 1407  degrees of freedom
## AIC: 355.37
## 
## Number of Fisher Scoring iterations: 7
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

Hosmer-Lemeshow
hoslem.test(x = glm_child$y, y = fitted(glm_child))
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  glm_child$y, fitted(glm_child)
## X-squared = 5.41, df = 8, p-value = 0.713
AIC
extractAIC(glm_child)
## [1]   3.0000 355.3652
BIC
extractAIC(glm_child, k = log(nrow(glm_child$data)))
## [1]   3.0000 371.1192
effectiveness of explanatory variables
glm_child_null <- glm(C003_01~1, family = "binomial", data = child_ica_dummy %>% filter(C001 >= 5, DID == 266))
anova(glm_child_null, glm_child, test = "Chisq")
variables selection
step(glm_child_null, direction = "both", 
     scope = (~ C001 + C002_01 + PR004_PR009_both_01 + n_children_in_household))
## Start:  AIC=386.83
## C003_01 ~ 1
## 
##                           Df Deviance    AIC
## + PR004_PR009_both_01      1   358.59 362.59
## + n_children_in_household  1   365.61 369.61
## <none>                         384.83 386.83
## + C001                     1   384.22 388.22
## + C002_01                  1   384.61 388.61
## 
## Step:  AIC=362.59
## C003_01 ~ PR004_PR009_both_01
## 
##                           Df Deviance    AIC
## + n_children_in_household  1   349.37 355.37
## <none>                         358.59 362.59
## + C002_01                  1   358.16 364.16
## + C001                     1   358.55 364.55
## - PR004_PR009_both_01      1   384.83 386.83
## 
## Step:  AIC=355.37
## C003_01 ~ PR004_PR009_both_01 + n_children_in_household
## 
##                           Df Deviance    AIC
## <none>                         349.37 355.37
## + C002_01                  1   348.46 356.46
## + C001                     1   349.34 357.34
## - n_children_in_household  1   358.59 362.59
## - PR004_PR009_both_01      1   365.61 369.61
## 
## Call:  glm(formula = C003_01 ~ PR004_PR009_both_01 + n_children_in_household, 
##     family = "binomial", data = child_ica_dummy %>% filter(C001 >= 
##         5, DID == 266))
## 
## Coefficients:
##             (Intercept)      PR004_PR009_both_01  n_children_in_household  
##                  4.1748                   1.4519                  -0.3329  
## 
## Degrees of Freedom: 1409 Total (i.e. Null);  1407 Residual
## Null Deviance:       384.8 
## Residual Deviance: 349.4     AIC: 355.4
multicolinearity
vif(glm_child)
## n_children_in_household     PR004_PR009_both_01 
##                1.068705                1.068705

Whole Pak. GLM Age >= 5 C003_01 ~ n_children_in_household + PR004_PR009_both_01 + relevel(DID, ref = “266”)

glm_child <- glm(C003_01 ~ n_children_in_household + PR004_PR009_both_01 + relevel(DID, ref = "266"), family = "binomial", data = child_ica_dummy %>% filter(C001 >= 5))

glm_child %>% summary()
## 
## Call:
## glm(formula = C003_01 ~ n_children_in_household + PR004_PR009_both_01 + 
##     relevel(DID, ref = "266"), family = "binomial", data = child_ica_dummy %>% 
##     filter(C001 >= 5))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2002   0.1836   0.4668   0.6971   1.5731  
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   3.509591   0.156181  22.471  < 2e-16 ***
## n_children_in_household      -0.107577   0.003969 -27.104  < 2e-16 ***
## PR004_PR009_both_01           0.673143   0.017386  38.717  < 2e-16 ***
## relevel(DID, ref = "266")146 -0.875895   0.194633  -4.500 6.79e-06 ***
## relevel(DID, ref = "266")147 -1.482768   0.172762  -8.583  < 2e-16 ***
## relevel(DID, ref = "266")148 -0.915625   0.179277  -5.107 3.27e-07 ***
## relevel(DID, ref = "266")149 -1.643550   0.168400  -9.760  < 2e-16 ***
## relevel(DID, ref = "266")150 -1.607819   0.168098  -9.565  < 2e-16 ***
## relevel(DID, ref = "266")151 -0.807095   0.185542  -4.350 1.36e-05 ***
## relevel(DID, ref = "266")152 -0.717054   0.196612  -3.647 0.000265 ***
## relevel(DID, ref = "266")153 -1.736759   0.170212 -10.203  < 2e-16 ***
## relevel(DID, ref = "266")154 -1.553572   0.172629  -8.999  < 2e-16 ***
## relevel(DID, ref = "266")155 -1.670105   0.167193  -9.989  < 2e-16 ***
## relevel(DID, ref = "266")156 -2.140333   0.181540 -11.790  < 2e-16 ***
## relevel(DID, ref = "266")157 -1.103420   0.181688  -6.073 1.25e-09 ***
## relevel(DID, ref = "266")158 -1.156450   0.177290  -6.523 6.89e-11 ***
## relevel(DID, ref = "266")159 -2.184450   0.172128 -12.691  < 2e-16 ***
## relevel(DID, ref = "266")160 -1.709068   0.169995 -10.054  < 2e-16 ***
## relevel(DID, ref = "266")161 -2.304469   0.163081 -14.131  < 2e-16 ***
## relevel(DID, ref = "266")162 -0.630815   0.203630  -3.098 0.001949 ** 
## relevel(DID, ref = "266")163 -1.012210   0.196169  -5.160 2.47e-07 ***
## relevel(DID, ref = "266")164 -1.659793   0.180095  -9.216  < 2e-16 ***
## relevel(DID, ref = "266")165 -1.903771   0.169885 -11.206  < 2e-16 ***
## relevel(DID, ref = "266")166 -1.684691   0.171039  -9.850  < 2e-16 ***
## relevel(DID, ref = "266")167 -0.396569   0.202148  -1.962 0.049789 *  
## relevel(DID, ref = "266")169 -2.712608   0.163731 -16.568  < 2e-16 ***
## relevel(DID, ref = "266")170 -1.222926   0.177931  -6.873 6.28e-12 ***
## relevel(DID, ref = "266")171 -1.205940   0.180436  -6.683 2.33e-11 ***
## relevel(DID, ref = "266")172 -1.385540   0.175339  -7.902 2.74e-15 ***
## relevel(DID, ref = "266")173 -1.117779   0.177247  -6.306 2.86e-10 ***
## relevel(DID, ref = "266")174 -1.723520   0.172349 -10.000  < 2e-16 ***
## relevel(DID, ref = "266")175 -2.293485   0.166458 -13.778  < 2e-16 ***
## relevel(DID, ref = "266")176 -0.389622   0.228612  -1.704 0.088327 .  
## relevel(DID, ref = "266")177 -1.775120   0.168013 -10.565  < 2e-16 ***
## relevel(DID, ref = "266")178 -0.088396   0.210848  -0.419 0.675040    
## relevel(DID, ref = "266")179 -1.150526   0.179781  -6.400 1.56e-10 ***
## relevel(DID, ref = "266")180 -1.451699   0.173220  -8.381  < 2e-16 ***
## relevel(DID, ref = "266")181 -1.148265   0.182264  -6.300 2.98e-10 ***
## relevel(DID, ref = "266")182 -2.301266   0.165160 -13.934  < 2e-16 ***
## relevel(DID, ref = "266")183 -1.559035   0.172473  -9.039  < 2e-16 ***
## relevel(DID, ref = "266")184 -2.161783   0.165073 -13.096  < 2e-16 ***
## relevel(DID, ref = "266")185 -0.255138   0.189408  -1.347 0.177970    
## relevel(DID, ref = "266")186 -2.386320   0.164891 -14.472  < 2e-16 ***
## relevel(DID, ref = "266")187 -2.162162   0.166537 -12.983  < 2e-16 ***
## relevel(DID, ref = "266")188 -1.932412   0.167238 -11.555  < 2e-16 ***
## relevel(DID, ref = "266")189 -1.114197   0.175976  -6.332 2.43e-10 ***
## relevel(DID, ref = "266")190 -1.830149   0.174173 -10.508  < 2e-16 ***
## relevel(DID, ref = "266")191 -1.648106   0.174426  -9.449  < 2e-16 ***
## relevel(DID, ref = "266")192 -1.337355   0.176637  -7.571 3.70e-14 ***
## relevel(DID, ref = "266")193 -0.664444   0.179964  -3.692 0.000222 ***
## relevel(DID, ref = "266")194 -2.631858   0.164251 -16.023  < 2e-16 ***
## relevel(DID, ref = "266")195 -2.789213   0.165049 -16.899  < 2e-16 ***
## relevel(DID, ref = "266")196 -1.025130   0.192010  -5.339 9.35e-08 ***
## relevel(DID, ref = "266")197 -2.305263   0.165346 -13.942  < 2e-16 ***
## relevel(DID, ref = "266")198 -2.639440   0.164133 -16.081  < 2e-16 ***
## relevel(DID, ref = "266")199  0.401975   0.214627   1.873 0.061083 .  
## relevel(DID, ref = "266")200 -2.379658   0.167313 -14.223  < 2e-16 ***
## relevel(DID, ref = "266")202 -2.257160   0.166358 -13.568  < 2e-16 ***
## relevel(DID, ref = "266")203 -2.441492   0.163484 -14.934  < 2e-16 ***
## relevel(DID, ref = "266")204 -1.682783   0.169939  -9.902  < 2e-16 ***
## relevel(DID, ref = "266")205 -2.500618   0.163197 -15.323  < 2e-16 ***
## relevel(DID, ref = "266")206 -2.476645   0.163259 -15.170  < 2e-16 ***
## relevel(DID, ref = "266")207 -3.060126   0.163533 -18.713  < 2e-16 ***
## relevel(DID, ref = "266")208 -2.343127   0.164555 -14.239  < 2e-16 ***
## relevel(DID, ref = "266")209 -1.662136   0.175557  -9.468  < 2e-16 ***
## relevel(DID, ref = "266")210 -2.666322   0.162637 -16.394  < 2e-16 ***
## relevel(DID, ref = "266")211 -1.806394   0.173698 -10.400  < 2e-16 ***
## relevel(DID, ref = "266")212 -2.424800   0.165135 -14.684  < 2e-16 ***
## relevel(DID, ref = "266")213 -2.517405   0.163698 -15.378  < 2e-16 ***
## relevel(DID, ref = "266")214 -2.856169   0.163880 -17.428  < 2e-16 ***
## relevel(DID, ref = "266")215 -1.894926   0.163571 -11.585  < 2e-16 ***
## relevel(DID, ref = "266")216 -2.667465   0.165232 -16.144  < 2e-16 ***
## relevel(DID, ref = "266")217 -2.621078   0.164739 -15.910  < 2e-16 ***
## relevel(DID, ref = "266")218 -2.373729   0.162463 -14.611  < 2e-16 ***
## relevel(DID, ref = "266")219 -2.573537   0.165147 -15.583  < 2e-16 ***
## relevel(DID, ref = "266")220 -2.929650   0.161316 -18.161  < 2e-16 ***
## relevel(DID, ref = "266")221 -3.388760   0.164526 -20.597  < 2e-16 ***
## relevel(DID, ref = "266")222 -2.641607   0.162435 -16.263  < 2e-16 ***
## relevel(DID, ref = "266")223 -2.335626   0.171366 -13.629  < 2e-16 ***
## relevel(DID, ref = "266")224 -2.947502   0.163771 -17.998  < 2e-16 ***
## relevel(DID, ref = "266")225 -2.765260   0.164207 -16.840  < 2e-16 ***
## relevel(DID, ref = "266")226 -2.445937   0.166589 -14.682  < 2e-16 ***
## relevel(DID, ref = "266")227 -2.505052   0.163056 -15.363  < 2e-16 ***
## relevel(DID, ref = "266")228 -3.436061   0.165374 -20.778  < 2e-16 ***
## relevel(DID, ref = "266")229 -2.788615   0.166051 -16.794  < 2e-16 ***
## relevel(DID, ref = "266")230 -2.951117   0.163918 -18.004  < 2e-16 ***
## relevel(DID, ref = "266")231 -2.540898   0.163221 -15.567  < 2e-16 ***
## relevel(DID, ref = "266")232 -1.303620   0.172806  -7.544 4.56e-14 ***
## relevel(DID, ref = "266")233 -3.337002   0.166821 -20.004  < 2e-16 ***
## relevel(DID, ref = "266")234 -2.488387   0.169736 -14.660  < 2e-16 ***
## relevel(DID, ref = "266")235 -1.659018   0.168694  -9.834  < 2e-16 ***
## relevel(DID, ref = "266")236 -1.870621   0.169750 -11.020  < 2e-16 ***
## relevel(DID, ref = "266")237 -1.475577   0.173747  -8.493  < 2e-16 ***
## relevel(DID, ref = "266")238  1.232565   0.295481   4.171 3.03e-05 ***
## relevel(DID, ref = "266")239 -1.145610   0.177968  -6.437 1.22e-10 ***
## relevel(DID, ref = "266")240 -1.659761   0.170571  -9.731  < 2e-16 ***
## relevel(DID, ref = "266")241 -2.841397   0.164413 -17.282  < 2e-16 ***
## relevel(DID, ref = "266")242 -1.867780   0.170169 -10.976  < 2e-16 ***
## relevel(DID, ref = "266")243  0.320346   0.303443   1.056 0.291104    
## relevel(DID, ref = "266")244 -1.201486   0.181179  -6.632 3.32e-11 ***
## relevel(DID, ref = "266")245 -0.297386   0.202708  -1.467 0.142358    
## relevel(DID, ref = "266")246 -2.358940   0.166469 -14.170  < 2e-16 ***
## relevel(DID, ref = "266")247 -2.172820   0.171802 -12.647  < 2e-16 ***
## relevel(DID, ref = "266")248 -1.678299   0.165842 -10.120  < 2e-16 ***
## relevel(DID, ref = "266")249 -0.647963   0.196336  -3.300 0.000966 ***
## relevel(DID, ref = "266")250 -1.467809   0.170954  -8.586  < 2e-16 ***
## relevel(DID, ref = "266")251 -1.356769   0.175621  -7.726 1.11e-14 ***
## relevel(DID, ref = "266")252 -1.572819   0.170335  -9.234  < 2e-16 ***
## relevel(DID, ref = "266")253 -1.325726   0.178398  -7.431 1.08e-13 ***
## relevel(DID, ref = "266")254 -2.523487   0.180001 -14.019  < 2e-16 ***
## relevel(DID, ref = "266")255 -1.601880   0.175767  -9.114  < 2e-16 ***
## relevel(DID, ref = "266")256 -0.530739   0.181964  -2.917 0.003537 ** 
## relevel(DID, ref = "266")257 -0.491253   0.215678  -2.278 0.022743 *  
## relevel(DID, ref = "266")258 -2.397226   0.164104 -14.608  < 2e-16 ***
## relevel(DID, ref = "266")259 -0.120495   0.218934  -0.550 0.582065    
## relevel(DID, ref = "266")260 -0.579001   0.183002  -3.164 0.001557 ** 
## relevel(DID, ref = "266")261 -3.072381   0.162677 -18.886  < 2e-16 ***
## relevel(DID, ref = "266")262 -1.110709   0.174535  -6.364 1.97e-10 ***
## relevel(DID, ref = "266")263 -0.618894   0.185937  -3.329 0.000873 ***
## relevel(DID, ref = "266")264 -0.817633   0.179256  -4.561 5.09e-06 ***
## relevel(DID, ref = "266")265 -0.190128   0.204767  -0.929 0.353142    
## relevel(DID, ref = "266")267 -0.203894   0.200998  -1.014 0.310388    
## relevel(DID, ref = "266")268 -1.662288   0.179179  -9.277  < 2e-16 ***
## relevel(DID, ref = "266")269  0.958226   0.302515   3.168 0.001537 ** 
## relevel(DID, ref = "266")270 -0.206918   0.198848  -1.041 0.298067    
## relevel(DID, ref = "266")271  0.511318   0.257880   1.983 0.047393 *  
## relevel(DID, ref = "266")272  0.933463   0.295507   3.159 0.001584 ** 
## relevel(DID, ref = "266")273 -0.224661   0.210888  -1.065 0.286736    
## relevel(DID, ref = "266")274  0.181126   0.269851   0.671 0.502089    
## relevel(DID, ref = "266")275 -1.877979   0.170434 -11.019  < 2e-16 ***
## relevel(DID, ref = "266")276  1.254633   0.295322   4.248 2.15e-05 ***
## relevel(DID, ref = "266")277 -1.036492   0.226583  -4.574 4.77e-06 ***
## relevel(DID, ref = "266")278 -1.035538   0.167092  -6.197 5.74e-10 ***
## relevel(DID, ref = "266")279 -2.582915   0.164997 -15.654  < 2e-16 ***
## relevel(DID, ref = "266")280 -1.305004   0.172797  -7.552 4.28e-14 ***
## relevel(DID, ref = "266")281 -0.702178   0.180060  -3.900 9.63e-05 ***
## relevel(DID, ref = "266")282 -1.272273   0.181308  -7.017 2.26e-12 ***
## relevel(DID, ref = "266")284 -1.538773   0.170887  -9.005  < 2e-16 ***
## relevel(DID, ref = "266")287 -1.485674   0.175398  -8.470  < 2e-16 ***
## relevel(DID, ref = "266")289 -1.862308   0.166027 -11.217  < 2e-16 ***
## relevel(DID, ref = "266")290 -1.680815   0.170166  -9.878  < 2e-16 ***
## relevel(DID, ref = "266")315 -1.335461   0.190690  -7.003 2.50e-12 ***
## relevel(DID, ref = "266")316 -2.145252   0.167764 -12.787  < 2e-16 ***
## relevel(DID, ref = "266")318 -2.754422   0.168236 -16.372  < 2e-16 ***
## relevel(DID, ref = "266")319 -3.219120   0.163022 -19.747  < 2e-16 ***
## relevel(DID, ref = "266")320 -2.641120   0.168331 -15.690  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 212206  on 208249  degrees of freedom
## Residual deviance: 183508  on 208104  degrees of freedom
## AIC: 183800
## 
## Number of Fisher Scoring iterations: 7
glm_child %>% 
  summary() %>% 
  .$coefficient %>% 
  as.data.frame() %>% 
  select(`Pr(>|z|)`) %>% 
  filter(`Pr(>|z|)` > 0.05) %>% 
  row.names() %>% 
  str_split("\\)") %>% 
  as.data.frame() %>% 
  gather() %>% 
  select(value) %>% 
  .[seq(2,24,2),] %>% 
  as.numeric()
##  [1] 176 178 185 199 243 245 259 265 267 270 273 274

dists_near_hunza_2. GLM Age >= 5 C003_01 ~ n_children_in_household + PR004_PR009_both_01

glm_child <- glm(C003_01 ~ n_children_in_household + PR004_PR009_both_01, family = "binomial", data = child_ica_dummy %>% filter(C001 >= 5, DID %in% dists_near_hunza_2))

glm_child %>% summary()
## 
## Call:
## glm(formula = C003_01 ~ n_children_in_household + PR004_PR009_both_01, 
##     family = "binomial", data = child_ica_dummy %>% filter(C001 >= 
##         5, DID %in% dists_near_hunza_2))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8439   0.2201   0.2575   0.2913   0.4617  
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              3.61568    0.12770  28.314  < 2e-16 ***
## n_children_in_household -0.15896    0.03082  -5.158 2.49e-07 ***
## PR004_PR009_both_01      0.56938    0.08634   6.595 4.25e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5235.5  on 16790  degrees of freedom
## Residual deviance: 5161.7  on 16788  degrees of freedom
## AIC: 5167.7
## 
## Number of Fisher Scoring iterations: 6
Hosmer-Lemeshow
hoslem.test(x = glm_child$y, y = fitted(glm_child))
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  glm_child$y, fitted(glm_child)
## X-squared = 12.573, df = 8, p-value = 0.1274

dists_near_hunza_2 or Not. GLM Age >= 5 C003_01 ~ n_children_in_household + PR004_PR009_both_01 + DID_near_hunza_2

glm_child <- glm(C003_01 ~ n_children_in_household + PR004_PR009_both_01 + DID_near_hunza_2, family = "binomial", data = child_ica_dummy %>% filter(C001 >= 5))

glm_child %>% summary()
## 
## Call:
## glm(formula = C003_01 ~ n_children_in_household + PR004_PR009_both_01 + 
##     DID_near_hunza_2, family = "binomial", data = child_ica_dummy %>% 
##     filter(C001 >= 5))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.9326   0.3055   0.5206   0.7875   1.1187  
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              1.448166   0.015604   92.81   <2e-16 ***
## n_children_in_household -0.109043   0.003552  -30.70   <2e-16 ***
## PR004_PR009_both_01      1.136312   0.015705   72.36   <2e-16 ***
## DID_near_hunza_2         1.811042   0.041876   43.25   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 212206  on 208249  degrees of freedom
## Residual deviance: 199881  on 208246  degrees of freedom
## AIC: 199889
## 
## Number of Fisher Scoring iterations: 5
Hosmer-Lemeshow
hoslem.test(x = glm_child$y, y = fitted(glm_child))
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  glm_child$y, fitted(glm_child)
## X-squared = 33.224, df = 8, p-value = 5.611e-05

Not dists_near_hunza. GLM Age >= 5 C003_01 ~ n_children_in_household + PR004_PR009_both_01 + as.factor(H004)

glm_child <- glm(C003_01 ~ n_children_in_household + C002_01 + PR004_PR009_both_01 + as.factor(H004), family = "binomial", data = child_ica_dummy %>% filter(C001 >= 5, !DID %in% dists_near_hunza_2))

glm_child %>% summary()
## 
## Call:
## glm(formula = C003_01 ~ n_children_in_household + C002_01 + PR004_PR009_both_01 + 
##     as.factor(H004), family = "binomial", data = child_ica_dummy %>% 
##     filter(C001 >= 5, !DID %in% dists_near_hunza_2))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4357   0.3420   0.5790   0.7647   1.4179  
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              1.908080   0.017456  109.31   <2e-16 ***
## n_children_in_household -0.103530   0.003677  -28.16   <2e-16 ***
## C002_01                 -0.779046   0.011555  -67.42   <2e-16 ***
## PR004_PR009_both_01      1.108833   0.016443   67.44   <2e-16 ***
## as.factor(H004)0        -0.643349   0.014718  -43.71   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 200070  on 189127  degrees of freedom
## Residual deviance: 185935  on 189123  degrees of freedom
##   (2331 observations deleted due to missingness)
## AIC: 185945
## 
## Number of Fisher Scoring iterations: 4
Hosmer-Lemeshow
hoslem.test(x = glm_child$y, y = fitted(glm_child))
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  glm_child$y, fitted(glm_child)
## X-squared = 156.66, df = 8, p-value < 2.2e-16
multicolinearity
vif(glm_child)
## n_children_in_household                 C002_01     PR004_PR009_both_01 
##                1.008000                1.008143                1.033210 
##         as.factor(H004) 
##                1.024592
g_n <- gschool %>% 
  group_by(DID) %>% 
  summarize(g_n = sum(unique(GSID)))
## `summarise()` ungrouping output (override with `.groups` argument)
p_n <- pschool %>% 
  group_by(DID) %>% 
  summarize(p_n = sum(unique(PSID)))
## `summarise()` ungrouping output (override with `.groups` argument)
school_type <- g_n %>% full_join(p_n) %>% 
  mutate(ratio = p_n/(g_n + p_n))
## Joining, by = "DID"
school_type$DID <- as.factor(school_type$DID)
child_ica_dummy %>% 
  full_join(school_type)
## Joining, by = "DID"

Not dists_near_hunza. GLM Age >= 5 ratio + C002_01 + PR004_PR009_both_01

glm_child <- glm(C003_01 ~ ratio + C002_01 + PR004_PR009_both_01, family = "binomial", data = child_ica_dummy %>% full_join(school_type) %>% filter(C001 >= 5, !DID %in% dists_near_hunza_2))
## Joining, by = "DID"
## Joining, by = "DID"
ggPredict(glm_child, se = TRUE, colorAsFactor = TRUE, show.summary = TRUE, point = TRUE)
## 
## Call:
## glm(formula = C003_01 ~ ratio + C002_01 + PR004_PR009_both_01, 
##     family = "binomial", data = child_ica_dummy %>% full_join(school_type) %>% 
##         filter(C001 >= 5, !DID %in% dists_near_hunza_2))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6413   0.3124   0.5318   0.7304   1.0411  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          1.06540    0.01338   79.65   <2e-16 ***
## ratio                2.04959    0.04468   45.87   <2e-16 ***
## C002_01             -0.75283    0.01401  -53.75   <2e-16 ***
## PR004_PR009_both_01  1.11872    0.01922   58.20   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 140074  on 143360  degrees of freedom
## Residual deviance: 129578  on 143357  degrees of freedom
##   (48098 observations deleted due to missingness)
## AIC: 129586
## 
## Number of Fisher Scoring iterations: 5
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

# glm_child %>% summary()
Hosmer-Lemeshow
hoslem.test(x = glm_child$y, y = fitted(glm_child))
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  glm_child$y, fitted(glm_child)
## X-squared = 602.05, df = 8, p-value < 2.2e-16

Not dists_near_hunza. GLM Age >= 5 n_children_in_household + ratio + C002_01 + PR004_PR009_both_01

glm_child <- glm(C003_01 ~ n_children_in_household + ratio + C002_01 + PR004_PR009_both_01, family = "binomial", data = child_ica_dummy %>% full_join(school_type) %>% filter(C001 >= 5, !DID %in% dists_near_hunza_2))
## Joining, by = "DID"
## Joining, by = "DID"
ggPredict(glm_child, se = TRUE, colorAsFactor = TRUE, show.summary = TRUE, point = TRUE)
## 
## Call:
## glm(formula = C003_01 ~ n_children_in_household + ratio + C002_01 + 
##     PR004_PR009_both_01, family = "binomial", data = child_ica_dummy %>% 
##     full_join(school_type) %>% filter(C001 >= 5, !DID %in% dists_near_hunza_2))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7354   0.3087   0.5204   0.7147   1.3120  
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              1.479115   0.022225   66.55   <2e-16 ***
## n_children_in_household -0.103089   0.004352  -23.69   <2e-16 ***
## ratio                    2.033637   0.044722   45.47   <2e-16 ***
## C002_01                 -0.743131   0.014044  -52.91   <2e-16 ***
## PR004_PR009_both_01      1.077862   0.019318   55.80   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 140074  on 143360  degrees of freedom
## Residual deviance: 129023  on 143356  degrees of freedom
##   (48098 observations deleted due to missingness)
## AIC: 129033
## 
## Number of Fisher Scoring iterations: 5
## Warning in ggPredict(glm_child, se = TRUE, colorAsFactor = TRUE, show.summary =
## TRUE, : maximum three independent variables are allowed
## NULL
# glm_child %>% summary()
Hosmer-Lemeshow
hoslem.test(x = glm_child$y, y = fitted(glm_child)) 
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  glm_child$y, fitted(glm_child)
## X-squared = 254.37, df = 8, p-value < 2.2e-16

Each Dist. GLM Age >= 5 C003_01 ~ n_children_in_household + PR004_PR009_both_01

each_dist <- sapply(as.numeric(as.character(unique(child_ica_dummy$DID))), function(id){
  child_ica_dummy <- child_ica_dummy %>% filter(DID == id)
  glm_child <- glm(C003_01 ~ n_children_in_household + PR004_PR009_both_01, 
                   family = "binomial", data = child_ica_dummy %>% filter(C001 >= 5))
  temp_hoslem <- hoslem.test(x = glm_child$y, y = fitted(glm_child))
  data.frame(DID = id, hoslem_p_value = temp_hoslem$p.value)
}) %>% t()
each_dist
##        DID hoslem_p_value
##   [1,] 146 0.4833365     
##   [2,] 147 0.9696368     
##   [3,] 148 0.2340054     
##   [4,] 149 0.263045      
##   [5,] 150 0.4052944     
##   [6,] 151 0.6107987     
##   [7,] 152 0.5295676     
##   [8,] 153 0.2108079     
##   [9,] 154 0.8521452     
##  [10,] 155 0.9999996     
##  [11,] 156 0.4243112     
##  [12,] 157 0.1996798     
##  [13,] 158 0.269862      
##  [14,] 159 0.957696      
##  [15,] 160 0.1873412     
##  [16,] 161 0.09862571    
##  [17,] 162 0.970035      
##  [18,] 163 0.07888207    
##  [19,] 164 0.8745456     
##  [20,] 165 0.009088681   
##  [21,] 166 0.6417484     
##  [22,] 167 0.8832939     
##  [23,] 169 0.9760668     
##  [24,] 170 0.3695271     
##  [25,] 171 0.1588784     
##  [26,] 172 0.07933903    
##  [27,] 173 0.009982887   
##  [28,] 174 0.1314211     
##  [29,] 175 0.9932377     
##  [30,] 176 0.9237029     
##  [31,] 177 0.5996293     
##  [32,] 178 0.6349025     
##  [33,] 179 0.2552938     
##  [34,] 180 0.5420001     
##  [35,] 181 0.7728695     
##  [36,] 182 0.9210615     
##  [37,] 183 0.00772995    
##  [38,] 184 0.6966427     
##  [39,] 185 0.1115057     
##  [40,] 186 0.6361407     
##  [41,] 187 NaN           
##  [42,] 188 0.6543812     
##  [43,] 189 0.5089121     
##  [44,] 190 0.4263431     
##  [45,] 191 0.9611336     
##  [46,] 192 0.2009149     
##  [47,] 193 6.445687e-06  
##  [48,] 194 0.615997      
##  [49,] 195 0.2881515     
##  [50,] 196 0.003757288   
##  [51,] 197 0.959232      
##  [52,] 198 0.3930063     
##  [53,] 199 0.3020499     
##  [54,] 200 0.2915802     
##  [55,] 202 0.8220956     
##  [56,] 203 3.982822e-07  
##  [57,] 204 0.7540585     
##  [58,] 315 0.4577361     
##  [59,] 316 0.01377099    
##  [60,] 320 0.03400093    
##  [61,] 205 0.9858706     
##  [62,] 206 0.9781201     
##  [63,] 207 0.9324696     
##  [64,] 208 0.9349555     
##  [65,] 209 0.8148914     
##  [66,] 210 0.1928247     
##  [67,] 211 0.9737445     
##  [68,] 212 0.9604219     
##  [69,] 213 0.8909924     
##  [70,] 214 0.8231951     
##  [71,] 215 0.6468684     
##  [72,] 216 0.927478      
##  [73,] 217 0.8091547     
##  [74,] 218 0.6925247     
##  [75,] 219 0.851332      
##  [76,] 220 0.1065343     
##  [77,] 221 0.9281304     
##  [78,] 222 0.2759677     
##  [79,] 223 0.8395059     
##  [80,] 224 0.8469389     
##  [81,] 225 0.9953924     
##  [82,] 226 0.1391444     
##  [83,] 227 0.4715612     
##  [84,] 228 0.951791      
##  [85,] 229 1             
##  [86,] 230 0.9506553     
##  [87,] 231 0.999271      
##  [88,] 232 0.002364582   
##  [89,] 233 0.8167503     
##  [90,] 234 0.2029032     
##  [91,] 318 0.8704222     
##  [92,] 319 0.2196002     
##  [93,] 235 0.2113758     
##  [94,] 236 0.02945804    
##  [95,] 237 0.2517542     
##  [96,] 238 0.6276721     
##  [97,] 239 0.2157016     
##  [98,] 240 0.1282194     
##  [99,] 241 5.197638e-05  
## [100,] 242 0.6327375     
## [101,] 243 0.9503456     
## [102,] 244 0.2637431     
## [103,] 245 0.9922232     
## [104,] 246 0.005520648   
## [105,] 247 0.2720873     
## [106,] 248 0.008590541   
## [107,] 249 0.4451278     
## [108,] 250 0.3860093     
## [109,] 251 0.9431389     
## [110,] 252 0.363282      
## [111,] 253 0.9562219     
## [112,] 254 0.5528683     
## [113,] 255 0.2014395     
## [114,] 256 0.655442      
## [115,] 257 0.006877678   
## [116,] 258 0.0006264664  
## [117,] 259 0.7785989     
## [118,] 260 0.07112161    
## [119,] 261 0.8481883     
## [120,] 262 0.4452464     
## [121,] 263 0.9315994     
## [122,] 264 0.8451285     
## [123,] 265 0.9716815     
## [124,] 266 0.7129842     
## [125,] 267 0.1644173     
## [126,] 268 0.03273428    
## [127,] 269 0.9902921     
## [128,] 270 0.9496613     
## [129,] 271 0.9579085     
## [130,] 272 0.8863188     
## [131,] 273 0.4208476     
## [132,] 274 0.7171719     
## [133,] 275 0.01355154    
## [134,] 276 0.9920801     
## [135,] 277 0.06343009    
## [136,] 278 0.01073304    
## [137,] 279 0.6630176     
## [138,] 280 0.4948066     
## [139,] 281 0.9708145     
## [140,] 282 0.8897523     
## [141,] 284 0.7440368     
## [142,] 287 0.3178372     
## [143,] 289 0.05144863    
## [144,] 290 0.1967437
each_dist <- each_dist %>% 
  as.data.frame() 
each_dist$DID <- each_dist %>% 
  pull(DID) %>% 
  as.numeric()
each_dist$hoslem_p_value <- each_dist %>% 
  pull(hoslem_p_value) %>% 
  as.numeric()
each_dist

Dists with P value <= 0.05

each_dist %>% 
  filter(hoslem_p_value <= 0.05)
dists_not_fit <- each_dist %>% 
  filter(hoslem_p_value <= 0.05) %>% 
  pull(DID)

Not dists_not_fit. GLM Age >= 5 C003_01 ~ n_children_in_household + PR004_PR009_both_01

glm_child <- glm(C003_01 ~ n_children_in_household + PR004_PR009_both_01, family = "binomial", data = child_ica_dummy %>% filter(C001 >= 5, !DID %in% dists_not_fit))

glm_child %>% summary()
## 
## Call:
## glm(formula = C003_01 ~ n_children_in_household + PR004_PR009_both_01, 
##     family = "binomial", data = child_ica_dummy %>% filter(C001 >= 
##         5, !DID %in% dists_not_fit))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3662   0.3771   0.6920   0.7748   1.1695  
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              1.564935   0.016627   94.12   <2e-16 ***
## n_children_in_household -0.128849   0.003823  -33.70   <2e-16 ***
## PR004_PR009_both_01      1.300495   0.017287   75.23   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 185614  on 181260  degrees of freedom
## Residual deviance: 176655  on 181258  degrees of freedom
## AIC: 176661
## 
## Number of Fisher Scoring iterations: 5
Hosmer-Lemeshow
hoslem.test(x = glm_child$y, y = fitted(glm_child))
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  glm_child$y, fitted(glm_child)
## X-squared = 44.192, df = 8, p-value = 5.235e-07